Load libraries

# Data manipulation and visualization 
library(tidyverse)
library(here)
library(janitor)

# Data analysis/statistics 
library(car) #ANOVA
library(ggResidpanel) #residual panel
library(ggeffects)
library(effects) #dependency for ggeffects
library(MuMIn)
library(gam)
library(mgcv)

Load data

FCM <- read_csv(here("HW11", "data", "HOT_cyto_counts.csv"))
MLDS <- read_csv(here("HW11", "data", "hot_mlds.csv"))

FCM <- clean_names(FCM)
MLDS <- clean_names(MLDS)
# reformating the date and extracting the day of the year
# pressure turned negative for easier visualization later on 
FCM <- FCM %>% mutate(cal_date = mdy(date)) %>% 
  relocate(cal_date, .after= date) %>% 
  mutate(doyear = yday(cal_date)) %>% 
  relocate(doyear, .after= cal_date) %>% 
  mutate(press = -1*press)
## Correction 
mod <- gam(log10(pro+0.01) ~ s(doyear, press, k =50), data= FCM)
gam.check(mod)

## 
## Method: GCV   Optimizer: magic
## Smoothing parameter selection converged after 5 iterations.
## The RMS GCV score gradient at convergence was 1.788017e-07 .
## The Hessian was positive definite.
## Model rank =  50 / 50 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                   k'  edf k-index p-value
## s(doyear,press) 49.0 30.7    1.08       1
plot(mod, scheme = 2, shift = coef(mod)[1], trans = exp)

Question 1

# making models of the microbes as a fucntion of pressure only 
proxdepth <- gam(sqrt(pro) ~ s(press), data= FCM)
hbactxdepth <- gam(hbact ~ s(press), data= FCM)
picoeukxdepth <- gam(sqrt(picoeuk)~ s(press), data= FCM)

par(mfrow = c(1,3))
plot(proxdepth, main = "pro", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "green")
plot(hbactxdepth, main = "hbact", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "red")
plot(picoeukxdepth, main= "pico euk", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "royalblue")

# making models of the microbes as a fucntion of day of year only 
proxday <- gam(pro ~ s(doyear), data= FCM)
hbactxday <- gam(hbact ~ s(doyear), data= FCM)
picoeukxday <- gam(sqrt(picoeuk)~ s(doyear), data= FCM)

par(mfrow = c(1,3))
plot(proxday, main = "pro", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "green")
plot(hbactxday, main = "hbact", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "red")
plot(picoeukxday, main = "pico euk", residuals = TRUE, cex = 4, lwd = 3, shade = TRUE, shade.col = "royalblue")

Comprehesion models for each microbe

# making the comprehensive model with pressure and day of year for prochlorococcus
pro2D <- gam(sqrt(pro) ~ s(doyear, press), data= FCM) 
pro2D_X <- gam(sqrt(pro) ~ s(doyear) + s(press), data= FCM) 
summary(pro2D)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## sqrt(pro) ~ s(doyear, press)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.037586   0.004633   223.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df     F p-value    
## s(doyear,press) 27.39  28.84 375.4  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.899   Deviance explained = 90.1%
## GCV = 0.026751  Scale est. = 0.026127  n = 1217
anova(pro2D, pro2D_X)
## Analysis of Deviance Table
## 
## Model 1: sqrt(pro) ~ s(doyear, press)
## Model 2: sqrt(pro) ~ s(doyear) + s(press)
##   Resid. Df Resid. Dev      Df Deviance      F    Pr(>F)    
## 1    1187.2     31.055                                      
## 2    1202.5     35.707 -15.339  -4.6524 11.609 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation

The F value of the anova is significant which means that the interaction between depth and day of year explains the variation in abundance more than the variables alone.

# making the comprehensive model with pressure and day of year for het bacteria
hbact2D <- gam(hbact ~ s(doyear, press), data= FCM) 
hbact2D_X <- gam(hbact ~ s(doyear) + s(press), data= FCM) 
summary(hbact2D)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## hbact ~ s(doyear, press)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.88022    0.01811   214.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df     F p-value    
## s(doyear,press) 22.85  26.88 123.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.732   Deviance explained = 73.7%
## GCV = 0.40709  Scale est. = 0.39911   n = 1217
anova(hbact2D, hbact2D_X)
## Analysis of Deviance Table
## 
## Model 1: hbact ~ s(doyear, press)
## Model 2: hbact ~ s(doyear) + s(press)
##   Resid. Df Resid. Dev     Df Deviance      F    Pr(>F)    
## 1    1189.1     476.20                                     
## 2    1201.9     524.42 -12.83   -48.22 9.4167 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation

The F value of the anova is significant which means that the interaction between depth and day of year explains the variation in abundance more than the variables alone.

# making the comprehensive model with pressure and day of year for pico eukaryotes
picoeuk2D <- gam(sqrt(picoeuk) ~ s(doyear, press), data= FCM) 
picoeuk2D_X <- gam(sqrt(picoeuk) ~ s(doyear) + s(press), data= FCM) 
summary(picoeuk2D)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## sqrt(picoeuk) ~ s(doyear, press)
## 
## Parametric coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0933431  0.0006991   133.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df     F p-value    
## s(doyear,press) 24.31  27.73 45.82  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.514   Deviance explained = 52.3%
## GCV = 0.00060737  Scale est. = 0.00059474  n = 1217
anova(picoeuk2D,picoeuk2D_X)
## Analysis of Deviance Table
## 
## Model 1: sqrt(picoeuk) ~ s(doyear, press)
## Model 2: sqrt(picoeuk) ~ s(doyear) + s(press)
##   Resid. Df Resid. Dev      Df  Deviance      F    Pr(>F)    
## 1    1188.3    0.70875                                       
## 2    1200.0    0.77479 -11.746 -0.066037 9.4531 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation

The F value of the anova is significant which means that the interaction between depth and day of year explains the variation in abundance more than the variables alone.

par(mfrow = c(1,3))
plot(pro2D, main = "pro", residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
plot(hbact2D, main = "hbact", residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
plot(picoeuk2D, main = "pico euk", residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter

par(mfrow = c(1,3))
vis.gam(pro2D, view = c("doyear", "press"), plot.type = "contour")
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
vis.gam(hbact2D, view = c("doyear", "press"), plot.type = "contour")
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter
vis.gam(picoeuk2D, view = c("doyear", "press"), plot.type = "contour")
abline(v = 60, col = "pink",lwd = 3) #spring
abline(v = 152, col = "green",lwd = 3) #summer
abline(v = 244, col = "orange",lwd = 3) # fall
abline(v = 335, col = "skyblue",lwd = 3) # winter

Question 2

FCM_long <- FCM %>% 
  pivot_longer(cols = hbact:picoeuk,
                     names_to = "microbe", 
                     values_to = "micr_conc") %>% 
  filter(microbe != "syn") %>% 
  mutate(microbe = factor(microbe, levels= c("pro", "hbact", "picoeuk"))) 

Setting up the model

microbe_model <- gam(micr_conc ~ s(doyear, press, by = microbe) + microbe, data= FCM_long) 
summary(microbe_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## micr_conc ~ s(doyear, press, by = microbe) + microbe
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.33514    0.01210  110.36   <2e-16 ***
## microbehbact    2.54508    0.01711  148.75   <2e-16 ***
## microbepicoeuk -1.32520    0.01711  -77.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                 edf Ref.df       F p-value    
## s(doyear,press):microbepro     27.1  28.78 173.502  <2e-16 ***
## s(doyear,press):microbehbact   25.8  28.39 262.887  <2e-16 ***
## s(doyear,press):microbepicoeuk  2.0   2.00   0.013   0.987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.947   Deviance explained = 94.8%
## GCV =  0.181  Scale est. = 0.17813   n = 3651
# I don't know why my picoeukaryotes plot shows a linear trend instead of a contour plot...
par(mfrow = c(1,3))
plot(microbe_model, residuals = TRUE, cex = 4, lwd = 3, cex.main = 3)

Interpretation

(a) Do the different kinds of microbes have different mean abundances?
Yes, we saw that the estimated intercepts are different with the summary(microbe_model).

(b) Do the different kinds of microbes have different average depth distributions (i.e., averaging over time)?

# we can compare the full model to a model in which only the depth (pressure) is accounted for in the variation of abundance of the microbes
microbe_depth <- gam(micr_conc ~ s(press, by = microbe) + microbe, data= FCM_long) 
par(mfrow = c(1,3))
plot(microbe_depth)

# The F value of the anova is significant for pro and hbact which means that they are differently distributed on average by depth when averaging over time.
summary(microbe_depth)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## micr_conc ~ s(press, by = microbe) + microbe
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.33514    0.01326  100.72   <2e-16 ***
## microbehbact    2.54508    0.01875  135.77   <2e-16 ***
## microbepicoeuk -1.32520    0.01875  -70.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                           edf Ref.df       F p-value    
## s(press):microbepro     5.905  6.719 598.438  <2e-16 ***
## s(press):microbehbact   4.999  5.934 966.766  <2e-16 ***
## s(press):microbepicoeuk 1.000  1.000   0.015   0.904    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.936   Deviance explained = 93.7%
## GCV = 0.21471  Scale est. = 0.21383   n = 3651

(c) Do the different kinds of microbes have different average seasonal dynamics (i.e., averaging over depths)

# we can make a model that only accounts for the day of the year as a predictor for the variation in microbe abundance
microbe_doyear <- gam(micr_conc ~ s(doyear, by = microbe) + microbe, data= FCM_long) 
par(mfrow = c(1,3))
plot(microbe_doyear)

# the F value of the anova is only significant for hbact, which means that hetero trophic bacteria are distributed differently throughout the year. 
summary(microbe_doyear)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## micr_conc ~ s(doyear, by = microbe) + microbe
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.33514    0.02521   52.97   <2e-16 ***
## microbehbact    2.54508    0.03565   71.39   <2e-16 ***
## microbepicoeuk -1.32520    0.03565  -37.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                            edf Ref.df     F p-value    
## s(doyear):microbepro     2.077  2.581 1.922   0.131    
## s(doyear):microbehbact   6.485  7.617 8.084  <2e-16 ***
## s(doyear):microbepicoeuk 1.000  1.000 0.002   0.965    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.77   Deviance explained = 77.1%
## GCV = 0.77597  Scale est. = 0.7733    n = 3651

Question 3

MLDS<- MLDS %>% rename(cruise = crn)
full_df <- left_join(FCM, MLDS, by = c("cruise", "date"))
HOT <- full_df %>% select(-julian, -syn, -botid, -x1) %>% 
  # we only want the data of the top 45m 
  filter(press <=45) %>%  
  filter(mean  <=45) %>% 
  pivot_longer(cols = hbact:picoeuk,
                     names_to = "microbe", 
                     values_to = "micr_conc") %>% 
  mutate(microbe = factor(microbe, levels= c("pro", "hbact", "picoeuk"))) %>% 
  rename(mixlay = mean)

Mixed Layer

#ML_gam <- gam(micr_conc ~ s(press, by = microbe)+ s(doyear, mixlay, by = microbe) + microbe, data = HOT)
ML_gam <- gam(micr_conc ~ s(mixlay, by = microbe) + microbe, data = HOT)
summary(ML_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## micr_conc ~ s(mixlay, by = microbe) + microbe
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.36839    0.09561   14.31   <2e-16 ***
## microbehbact    2.47610    0.13521   18.31   <2e-16 ***
## microbepicoeuk -1.35897    0.13521  -10.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                            edf Ref.df     F  p-value    
## s(mixlay):microbepro     1.000  1.000 0.042 0.837407    
## s(mixlay):microbehbact   4.577  5.453 4.568 0.000433 ***
## s(mixlay):microbepicoeuk 1.000  1.000 0.000 0.996629    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 29/30
## R-sq.(adj) =  0.782   Deviance explained =   79%
## GCV = 0.75254  Scale est. = 0.72213   n = 237
par(mfrow = c(1,3))
plot(ML_gam)

Interpretation

We can see that the heterotrophic bacteria are significantly associated with the variation in mixed layer depth. There is a peak of hbact abundance around 32m and a low in hbact abundance at 39m.

Chlorophyll

chl_gam <- gam(micr_conc ~ s(chl, by = microbe) + microbe, data = HOT)
summary(chl_gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## micr_conc ~ s(chl, by = microbe) + microbe
## 
## Parametric coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.40333    0.09019   15.56   <2e-16 ***
## microbehbact    2.56013    0.12755   20.07   <2e-16 ***
## microbepicoeuk -1.39299    0.12755  -10.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                         edf Ref.df     F  p-value    
## s(chl):microbepro     3.895  4.825 4.976 0.000385 ***
## s(chl):microbehbact   7.952  8.711 5.825 1.46e-06 ***
## s(chl):microbepicoeuk 1.000  1.000 0.003 0.959428    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.834   Deviance explained = 84.6%
## GCV = 0.61587  Scale est. = 0.56939   n = 210
par(mfrow = c(1,3))
plot(chl_gam)

Interpretation

  • Prochlorococcus abundance vary with chlorophyll concentration. It looks like the maximum Prochlorococcus abnundance can be found wit 0.12ug/L of chlorophyll .
  • Heterotrophic bacteria varies similarly with variation of chlorophyll with a peak of hbact abundance at 0.10ug/L of chlorophyll.